{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "from pathlib import Path\n",
    "import os\n",
    "import shutil\n",
    "\n",
    "\n",
    "def cleanquestion(x: str) -> str:\n",
    "    \"去除文本中的符号 仅保留中文、英文、数字等\"\n",
    "    str_text = re.sub(\n",
    "        u\"([^\\u4e00-\\u9fa5\\u0030-\\u0039\\u0041-\\u005a\\u0061-\\u007a])\", \"\", x)\n",
    "    return str_text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7761</th>\n",
       "      <td>0</td>\n",
       "      <td>尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7762</th>\n",
       "      <td>0</td>\n",
       "      <td>盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7763</th>\n",
       "      <td>0</td>\n",
       "      <td>看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7764</th>\n",
       "      <td>0</td>\n",
       "      <td>我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7765</th>\n",
       "      <td>0</td>\n",
       "      <td>说实在的我很失望，之前看了其他人的点评后觉得还可以才去的，结果让我们大跌眼镜。我想这家酒店以...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>7765 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                             review\n",
       "0         1  距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...\n",
       "1         1                       商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!\n",
       "2         1         早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。房间本身很好。\n",
       "3         1  宾馆在小街道上，不大好找，但还好北京热心同胞很多~宾馆设施跟介绍的差不多，房间很小，确实挺小...\n",
       "4         1               CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\n",
       "...     ...                                                ...\n",
       "7761      0  尼斯酒店的几大特点：噪音大、环境差、配置低、服务效率低。如：1、隔壁歌厅的声音闹至午夜3点许...\n",
       "7762      0  盐城来了很多次，第一次住盐阜宾馆，我的确很失望整个墙壁黑咕隆咚的，好像被烟熏过一样家具非常的...\n",
       "7763      0  看照片觉得还挺不错的，又是4星级的，但入住以后除了后悔没有别的，房间挺大但空空的，早餐是有但...\n",
       "7764      0  我们去盐城的时候那里的最低气温只有4度，晚上冷得要死，居然还不开空调，投诉到酒店客房部，得到...\n",
       "7765      0  说实在的我很失望，之前看了其他人的点评后觉得还可以才去的，结果让我们大跌眼镜。我想这家酒店以...\n",
       "\n",
       "[7765 rows x 2 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rawdata = pd.read_csv(\"data_all/ChnSentiCorp_htl_all.csv\").pipe(\n",
    "    lambda x: x.dropna()\n",
    ").pipe(\n",
    "    lambda x: x\n",
    "    # lambda x: x.assign(**{\n",
    "    #     'review':x['review'].apply(lambda j: j[:512])\n",
    "    # })\n",
    ")\n",
    "rawdata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>freq</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>label</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2443</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>5322</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       freq\n",
       "label      \n",
       "0      2443\n",
       "1      5322"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rawdata.groupby(['label']).agg(\n",
    "    freq = ('review', 'count')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_data_dir = Path(\"data_all/data\")\n",
    "\n",
    "if target_data_dir.exists():\n",
    "    shutil.rmtree(target_data_dir)\n",
    "\n",
    "os.makedirs(name=target_data_dir, exist_ok=True)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6620</th>\n",
       "      <td>0</td>\n",
       "      <td>这家酒店我一共住了3次,在这里不得不说说他的不好,因为位置离我要去的地方近,所以选择了这家酒...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2230</th>\n",
       "      <td>1</td>\n",
       "      <td>还不错,就是房间和我预先看的不太一样,隔音不是很好~其它还不错~根据地段和价位,性价比还不错...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2441</th>\n",
       "      <td>1</td>\n",
       "      <td>环境相当好,很适合到那里渡假,软硬件设施都不错,我住的是小套房,房假也合理,就是早餐跟其它四...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5904</th>\n",
       "      <td>0</td>\n",
       "      <td>地点在十字路口边上，十分嘈杂，巴士车呼啦啦直到夜里11点。房间很小，洗漱用品不全：沐浴液、洗...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3873</th>\n",
       "      <td>1</td>\n",
       "      <td>一个非常不错的太湖边酒店,我的运气不错，本来订的是一般房间，结果被升级到了行政套房.酒店拥有...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2502</th>\n",
       "      <td>1</td>\n",
       "      <td>10月17日入住，总体感觉可以，离景区较远，正好可以从西塘政治中心一路走到老镇，感受岁月变化...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6451</th>\n",
       "      <td>0</td>\n",
       "      <td>订了商务大床间，可惜房间小的可怜！床和写字台之间太紧，基本上不能转身。更无法忍受的是房间卫生...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1730</th>\n",
       "      <td>1</td>\n",
       "      <td>环境不错、床单毛巾洁净、服务还行、房间宽敞，但感觉茶几、桌椅、地毯、浴帘有点脏，床垫不太舒服...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2446</th>\n",
       "      <td>1</td>\n",
       "      <td>国际五星级酒店情侣档专业酒店评价（TT）：先提一句，本来并没有订峨眉山大酒店，是从峨眉山天颐...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4068</th>\n",
       "      <td>1</td>\n",
       "      <td>很好的酒店，就是不知道为什么一直在涨价，价格越来越离谱，在常州属于极贵的酒店了。是不是携程和...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>6988 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      label                                               text\n",
       "6620      0  这家酒店我一共住了3次,在这里不得不说说他的不好,因为位置离我要去的地方近,所以选择了这家酒...\n",
       "2230      1  还不错,就是房间和我预先看的不太一样,隔音不是很好~其它还不错~根据地段和价位,性价比还不错...\n",
       "2441      1  环境相当好,很适合到那里渡假,软硬件设施都不错,我住的是小套房,房假也合理,就是早餐跟其它四...\n",
       "5904      0  地点在十字路口边上，十分嘈杂，巴士车呼啦啦直到夜里11点。房间很小，洗漱用品不全：沐浴液、洗...\n",
       "3873      1  一个非常不错的太湖边酒店,我的运气不错，本来订的是一般房间，结果被升级到了行政套房.酒店拥有...\n",
       "...     ...                                                ...\n",
       "2502      1  10月17日入住，总体感觉可以，离景区较远，正好可以从西塘政治中心一路走到老镇，感受岁月变化...\n",
       "6451      0  订了商务大床间，可惜房间小的可怜！床和写字台之间太紧，基本上不能转身。更无法忍受的是房间卫生...\n",
       "1730      1  环境不错、床单毛巾洁净、服务还行、房间宽敞，但感觉茶几、桌椅、地毯、浴帘有点脏，床垫不太舒服...\n",
       "2446      1  国际五星级酒店情侣档专业酒店评价（TT）：先提一句，本来并没有订峨眉山大酒店，是从峨眉山天颐...\n",
       "4068      1  很好的酒店，就是不知道为什么一直在涨价，价格越来越离谱，在常州属于极贵的酒店了。是不是携程和...\n",
       "\n",
       "[6988 rows x 2 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rawdata.sample(frac=0.9).pipe(\n",
    "    lambda x: x.rename(columns={\n",
    "        'review':'text'\n",
    "    })\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "rawdata.sample(frac=0.9).pipe(\n",
    "    lambda x: x.rename(columns={\n",
    "        'review':'text'\n",
    "    })\n",
    ").to_csv(\"data_all/data/train_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "rawdata.sample(frac=0.1).pipe(\n",
    "    lambda x: x.rename(columns={\n",
    "        'review':'text'\n",
    "    })\n",
    ").to_csv(\"data_all/data/test_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "rawdata.sample(frac=0.2).pipe(\n",
    "    lambda x: x.rename(columns={\n",
    "        'review':'text'\n",
    "    })\n",
    ").to_csv(\"data_all/data/valid_data.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mynet2",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "110bc624a448454d574a0cd6cc76359fd86f75739e493913b3d71c2e04f2ffdb"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
